Versions

V20 - Using SJC disease designations for cohort assignment

V19 - updated to survey_data.2020-07-08_17.17.10.txt, then survey_data.2020-07-14_11.31.11.txt

V18 - updated to survey_data.2020-07-02_11.01.36.txt

V17 - Change sample to dataset; change project to cohort

V16 - use namer on chunks

V15 - change correlation plots to small dots

V14 - combine plots

V13 - change gene count threshold from categporical to numeric

V12 - exclude datasets with fewer than 100 measured genes and use pre-generated survey_data

V11 - color consistency

V10 - calculate non genic and all exonic counts for correlations analysis

V9 - add gene count; import readdist.txt for more nuanced comparison of cor with gene count

Overview

reference range for unmapped reads is calculated based on unmapped reads/total

reference range for duplicate reads is calculated based on duplicate reads/mapped reads where mapped reads = total mapped and multi-mapped (<20x) reads)

reference range for non-exonic reads is calculated based on non-exonic/non duplicate reads) where non duplicate reads = exonic and non-exonic non dupe reads)

The composition fractions are not dependent on higher level ones

Libraries

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(viridis)
## Loading required package: viridisLite
library(knitr)
library(ggthemes) # for theme_linedraw() 
library(ggpubr) # for stat_cor
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(cowplot) # for ggdraw and draw_label
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library(RColorBrewer)
library(pander)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Config

inputs_dir <- "inputs"
data_file <- tibble(files = list.files(inputs_dir)) %>%
  arrange(desc(files)) %>%
  top_n(1, wt = files) %>%
  pull(files)

print(data_file)
## [1] "survey_data.2020-07-15_18.51.47.txt"

Settings

min_genes_gt_0 <- 100

Functions

is_outlier <- function(x) {
  (x > quantile(x, 0.75) + 1.5*IQR(x)) |
    (x < quantile(x, 0.25) - 1.5*IQR(x))
}


most_common_val_w_percent <- function(value_vector = c("a", "a", "b")){
  mc_vals <- tabyl(value_vector) %>%
    arrange(desc(n)) %>%
    top_n(1, n) 
  paste0(round(100*mc_vals$percent), "% ", mc_vals$value_vector)
}

# StatBin2 allows depiction of empty bins as blank instead of a horizontal line:
# https://stackoverflow.com/questions/57128090/remove-baseline-color-for-geom-histogram
StatBin2 <- ggproto(
  "StatBin2", 
  StatBin,
  compute_group = function (data, scales, binwidth = NULL, bins = NULL, 
                            center = NULL, boundary = NULL, 
                            closed = c("right", "left"), pad = FALSE, 
                            breaks = NULL, origin = NULL, right = NULL, 
                            drop = NULL, width = NULL) {
    if (!is.null(breaks)) {
      if (!scales$x$is_discrete()) {
        breaks <- scales$x$transform(breaks)
      }
      bins <- ggplot2:::bin_breaks(breaks, closed)
    }
    else if (!is.null(binwidth)) {
      if (is.function(binwidth)) {
        binwidth <- binwidth(data$x)
      }
      bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth, 
                                         center = center, boundary = boundary, 
                                         closed = closed)
    }
    else {
      bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins, 
                                        center = center, boundary = boundary, 
                                        closed = closed)
    }
    res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)

    # drop 0-count bins completely before returning the dataframe
    res <- res[res$count > 0, ] 

    res
  })

Import data

col_spec <- cols(
  .default = col_double(),
  dataset_id = col_character(),
  disease = col_character(),
  pedaya = col_character(),
  gender = col_character(),
  cohort_name = col_character(),
  cohort_code = col_character(),
  doi_link = col_character(),
  source_repository = col_character(),
  repository_cohort_accession = col_character(),
  repository_dataset_accession = col_character()
)

counts_and_meta <- read_tsv(file.path(inputs_dir, data_file), col_types = col_spec) %>%
  group_by(cohort_code) %>%
  mutate(n_in_cohort = n())

Colors

# brewer.pal(8, "Paired")

these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
"#FDBF6F", "#FF7F00")

# light blue, dark blue, etc: green, red, orange (then purple and brown)

these_colors <- c("#1F78B4", "#1F78B4", "#33A02C", "#33A02C", "#E31A1C", "#E31A1C", 
"#FF7F00", "#FF7F00")

names(these_colors) <-  c("not assigned", "Total reads", "Non-exonic reads",  "Exonic reads", "Duplicate reads",  "Non-duplicate reads",  "Unmapped reads", "Mapped reads")

these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")

Datasets analyzed

nrow(counts_and_meta)
## [1] 2179

Diseases in dataset

counts_and_meta %>% tabyl(disease)
##                                     disease   n      percent
##                acute lymphoblastic leukemia 680 0.3120697568
##             acute megakaryoblastic leukemia 103 0.0472693896
##                      acute myeloid leukemia 221 0.1014226709
##             acute undifferentiated leukemia   1 0.0004589261
##                      adrenocortical adenoma   1 0.0004589261
##                       adrenocortical cancer   2 0.0009178522
##                    adrenocortical carcinoma  20 0.0091785223
##                   alveolar rhabdomyosarcoma  40 0.0183570445
##                          cholangiocarcinoma   1 0.0004589261
##                    choroid plexus carcinoma  25 0.0114731528
##         desmoplastic small round cell tumor   4 0.0018357045
##      dysembryoplastic neuroepithelial tumor  13 0.0059660395
##                  embryonal rhabdomyosarcoma  42 0.0192748967
##  embryonal tumor with multilayered rosettes   1 0.0004589261
##                                  ependymoma  98 0.0449747591
##                         epithelioid sarcoma   1 0.0004589261
##                               Ewing sarcoma  70 0.0321248279
##      fibrolamellar hepatocellular carcinoma   3 0.0013767783
##                                fibromatosis   1 0.0004589261
##              gastrointestinal stromal tumor   4 0.0018357045
##                             germ cell tumor   1 0.0004589261
##                     glioblastoma multiforme  29 0.0133088573
##                                      glioma 193 0.0885727398
##                              hepatoblastoma   1 0.0004589261
##                    hepatocellular carcinoma   3 0.0013767783
##                 hypereosinophillic syndrome   1 0.0004589261
##            juvenile myelomonocytic leukemia   1 0.0004589261
##                                    leukemia   1 0.0004589261
##                                    lymphoma  49 0.0224873795
##                             medulloblastoma 201 0.0922441487
##                                    melanoma   7 0.0032124828
##             melanotic neuroectodermal tumor   1 0.0004589261
##                                  meningioma   1 0.0004589261
##                             myofibromatosis   2 0.0009178522
##                    nasopharyngeal carcinoma   2 0.0009178522
##                               neuroblastoma  12 0.0055071134
##                    neuroendocrine carcinoma   1 0.0004589261
##                                not reported   2 0.0009178522
##                not reported - QC fail by PI   1 0.0004589261
##                                osteosarcoma 157 0.0720513997
##           ovarian serous cystadenocarcinoma   1 0.0004589261
##                                      PEComa   1 0.0004589261
##                    pleuropulmonary blastoma   1 0.0004589261
##                              retinoblastoma   2 0.0009178522
##                              rhabdoid tumor  65 0.0298301973
##                            rhabdomyosarcoma  53 0.0243230840
##    spindle cell/sclerosing rhabdomyosarcoma   3 0.0013767783
##          supratentorial embryonal tumor NOS  18 0.0082606700
##                            synovial sarcoma  22 0.0100963745
##                undifferentiated sarcoma NOS   3 0.0013767783
##       undifferentiated spindle cell sarcoma   2 0.0009178522
##                                 wilms tumor  11 0.0050481872
disease_freq_table <- counts_and_meta$disease %>%
  str_to_sentence %>%
  str_replace(" nos$", " NOS") %>%
  as.factor %>%
  fct_infreq %>%
  fct_lump(prop=0.01) %>%
  tabyl (var1 = "Disease") %>%
  adorn_pct_formatting(digits = 1)

colnames(disease_freq_table)[1]="Disease"

write_tsv(disease_freq_table, "results/disease_freq_table.txt")

disease_freq_table  %>%
  kable %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Disease n percent
Acute lymphoblastic leukemia 680 31.2%
Acute myeloid leukemia 221 10.1%
Medulloblastoma 201 9.2%
Glioma 193 8.9%
Osteosarcoma 157 7.2%
Acute megakaryoblastic leukemia 103 4.7%
Ependymoma 98 4.5%
Ewing sarcoma 70 3.2%
Rhabdoid tumor 65 3.0%
Rhabdomyosarcoma 53 2.4%
Lymphoma 49 2.2%
Embryonal rhabdomyosarcoma 42 1.9%
Alveolar rhabdomyosarcoma 40 1.8%
Glioblastoma multiforme 29 1.3%
Choroid plexus carcinoma 25 1.1%
Synovial sarcoma 22 1.0%
Other 131 6.0%
# many are leukemias
table(str_detect(counts_and_meta$disease, "leukemia"))
## 
## FALSE  TRUE 
##  1172  1007

Cohorts in dataset

library(ggthemes)

old_cohort_size_histogram <- ggplot(counts_and_meta) + 
  geom_histogram(aes(n_in_cohort, group=cohort_code), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
  theme_minimal(base_size = 20) +
  theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +  xlab("Cohort size") +
  ylab("Cohorts") + 
  theme(legend.position="none")

cohort_size_histogram <- ggplot(counts_and_meta %>%
  select(cohort_code, n_in_cohort) %>%
  distinct ) + 
  geom_histogram(aes(n_in_cohort), 
                 fill = "lightgrey", color = "black", stat = StatBin2,
                 breaks = seq(0,500, by=10)) +
  theme_minimal(base_size = 20) +
  theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +  
  xlab("Cohort size") +
  scale_y_continuous("Cohorts", breaks = seq(0, 8, 2)) + 
  theme(legend.position="none")

cohort_size_histogram

# for cohorts that are more than 75% one disease, color by that disease

n_distinct(counts_and_meta$cohort_code)
## [1] 48
sort(unique(counts_and_meta$n_in_cohort))
##  [1]   3   4   6   7   8  10  15  17  19  22  23  24  25  26  31  35  39
## [18]  43  44  54  56  62  63  65  78  82  84  88  89 103 127 137 141 337
counts_and_meta %>%
  select(cohort_code, n_in_cohort) %>%
  distinct %>%
  arrange(desc(n_in_cohort)) %>%
  kable
cohort_code n_in_cohort
Cohort_1 337
Cohort_2 141
Cohort_3 137
Cohort_4 127
Cohort_5 103
Cohort_6 89
Cohort_7 88
Cohort_8 84
Cohort_9 82
Cohort_10 78
Cohort_11 65
Cohort_12 63
Cohort_13 62
Cohort_14 56
Cohort_15 54
Cohort_16 44
Cohort_17 43
Cohort_18 39
Cohort_19 35
Cohort_20 31
Cohort_21 31
Cohort_22 26
Cohort_23 25
Cohort_24 25
Cohort_26 24
Cohort_25 24
Cohort_30 23
Cohort_27 23
Cohort_28 23
Cohort_29 23
Cohort_31 22
Cohort_32 19
Cohort_33 19
Cohort_34 17
Cohort_35 15
Cohort_36 10
Cohort_37 8
Cohort_38 8
Cohort_41 7
Cohort_39 7
Cohort_40 7
Cohort_42 6
Cohort_44 6
Cohort_43 6
Cohort_45 6
Cohort_47 4
Cohort_46 4
Cohort_48 3
counts_and_meta %>%
  select(cohort_code, n_in_cohort) %>%
  distinct %>%
  pull(n_in_cohort) %>%
  median
## [1] 24.5

repository supp table

repo_info <- read_tsv("inputs/repos.tsv")
## Parsed with column specification:
## cols(
##   repo_name = col_character(),
##   repo_abbrev = col_character(),
##   repo_url = col_character(),
##   source_accession_pattern = col_character()
## )
repo_info %>%
  rename(`Name`=repo_name, `Abbreviation`=repo_abbrev, URL = repo_url) %>%
  select(-source_accession_pattern) %>%
  kable %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Name Abbreviation URL
Short Read Archive SRA https://www.ncbi.nlm.nih.gov/sra
European Genome-phenome Archive EGA https://www.ebi.ac.uk/ega/home
St. Jude Cloud SJC https://www.stjude.cloud/
Kids First Data Portal KFDP https://portal.kidsfirstdrc.org/
Database of Genotypes and Phenotypes dbGaP https://www.ncbi.nlm.nih.gov/gap/

data access table

counts_and_meta %>% 
  select(cohort_code, cohort_name, doi_link, source_repository, repository_cohort_accession) %>%
#  group_by(across(c(-disease)))
  distinct %>%
  kable
cohort_code cohort_name doi_link source_repository repository_cohort_accession
Cohort_1 TARGET-10 NA dbGaP phs000218
Cohort_3 TARGET-20 NA dbGaP phs000218
Cohort_30 TARGET-21 NA dbGaP phs000218
Cohort_41 TARGET-30 NA dbGaP phs000218
Cohort_7 TARGET-40 NA dbGaP phs000218.v15.p5
Cohort_7 TARGET-40 NA dbGaP phs000218
Cohort_47 TARGET-50 NA dbGaP phs000218
Cohort_11 TARGET-52 NA dbGaP phs000218
Cohort_12 EGAD00001001098 http://dx.doi.org/10.1038/ng.3230 EGA EGAD00001001098
Cohort_27 EGAD00001000356 http://dx.doi.org/10.1016/j.cell.2013.09.042 EGA EGAD00001000356
Cohort_32 EGAD00001002680 http://dx.doi.org/10.1038/ncomms7302 EGA EGAD00001002680
Cohort_23 EGAD00001001666 http://dx.doi.org/10.1007/s00401-016-1539-z EGA EGAD00001001666
Cohort_26 phs000900.v1.p1 http://dx.doi.org/10.1038/nm.3855 dbGaP phs000900.v1.p1
Cohort_20 10.24370/SD_BHJXBDQK http://dx.doi.org/10.24370/SD_BHJXBDQK KFDB CBTTC-HGG-PA-01
Cohort_20 10.24370/SD_BHJXBDQK http://dx.doi.org/10.24370/SD_BHJXBDQK KFDB CBTTC_PNET_PA_01
Cohort_20 10.24370/SD_BHJXBDQK http://dx.doi.org/10.24370/SD_BHJXBDQK KFDB CBTTC-PolyA
Cohort_19 phs000699.v1.p1 http://dx.doi.org/10.1073/pnas.1419260111 dbGaP phs000699.v1.p1
Cohort_34 EGAD00001000158 http://dx.doi.org/10.1038/nature11327 EGA EGAD00001000158
Cohort_42 EGAD00001000328 http://dx.doi.org/10.1038/nature11284 EGA EGAD00001000328
Cohort_28 EGAD00001000617 http://dx.doi.org/10.1038/ng.2682, http://dx.doi.org/10.1016/j.cell.2013.09.042 EGA EGAD00001000617
Cohort_18 EGAD00001001620 http://dx.doi.org/10.1016/j.cell.2013.09.042, http://dx.doi.org/10.1038/nature11284 EGA EGAD00001001620
Cohort_25 EGAD00001000648 http://dx.doi.org/10.1016/j.cell.2013.09.042 EGA EGAD00001000648
Cohort_46 SRP006575 No publication SRA SRP006575
Cohort_44 SJC_Other https://doi.org/10.1038/ncomms7302 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_5 SJC_AMLM https://doi.org/10.1016/j.ccr.2012.10.007 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_5 SJC_AMLM NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_2 SJC_BALL NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_2 SJC_BALL https://doi.org/10.1056/NEJMoa1403088 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_16 SJC_CBF NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_24 SJC_CPC NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_37 SJC_E NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_10 SJC_EPD https://doi.org/10.1038/nature13109 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_10 SJC_EPD NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_21 SJC_ERG NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_21 SJC_ERG https://doi.org/10.1056/NEJMoa1403088 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_14 SJC_ETV https://doi.org/10.1038/ng.2611 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_14 SJC_ETV NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_9 SJC_HGG https://doi.org/10.1038/ng.2938 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_9 SJC_HGG NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_44 SJC_Other https://doi.org/10.1056/NEJMoa1403088 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_39 SJC_HYPO https://doi.org/10.1056/NEJMoa1403088 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_39 SJC_HYPO NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_43 SJC_INF NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_43 SJC_INF https://doi.org/10.1038/ng.3230 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_15 SJC_LGG https://doi.org/10.1038/ng.2611 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_15 SJC_LGG NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_38 SJC_MB NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_40 SJC_MEL NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_44 SJC_Other https://doi.org/10.1038/ng.3230 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_29 SJC_OS https://doi.org/10.1016/j.celrep.2014.03.003 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_29 SJC_OS https://doi.org/10.1016/j.celrep.2014.03.003, https://doi.org/10.1158/1541-7786.mcr-18-0620 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_22 SJC_PHALL NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_44 SJC_Other NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_17 SJC_RHB https://doi.org/10.1016/j.ccr.2013.11.002 SJC Pediatric Cancer Genome Project (PCGP)
Cohort_17 SJC_RHB NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_45 SJC_WLM NA SJC Pediatric Cancer Genome Project (PCGP)
Cohort_36 EGAD00001000826 http://dx.doi.org/10.1038/ncomms15936 EGA EGAD00001000826
Cohort_48 SRP040454 No publication SRA SRP040454
Cohort_8 phs000720.v2.p1 http://dx.doi.org/10.1158/2159-8290.CD-13-0639 dbGaP phs000720.v2.p1
Cohort_13 phs000768.v2.p1 http://dx.doi.org/10.1371/journal.pgen.1004475 dbGaP phs000768.v2.p1
Cohort_6 phs000673.v2.p1 http://dx.doi.org/10.1001/jama.2015.10080 dbGaP phs000673.v2.p1
Cohort_6 phs000673.v2.p1 http://dx.doi.org/10.1001/jama.2015.10080, http://dx.doi.org/10.1038/nature23306 dbGaP phs000673.v2.p1
Cohort_6 phs000673.v2.p1 NA dbGaP phs000673.v2.p1
Cohort_31 EGAD00001001927 http://dx.doi.org/10.1016/j.cell.2016.01.015 EGA EGAD00001001927
Cohort_4 EGAD00001003279 http://dx.doi.org/10.1038/nature22973 EGA EGAD00001003279
Cohort_35 SRP092501 http://dx.doi.org/10.1126/scitranslmed.aah6904 SRA SRP092501
Cohort_33 SRP126664 http://dx.doi.org/10.1016/j.ccell.2018.05.002 SRA SRP126664

Genders in dataset

table(counts_and_meta$gender, useNA = "always")
## 
##       female         male not reported      unknown         <NA> 
##          739         1010          310           82           38
# samples with reported genders
counts_and_meta %>% 
  filter(! is.na(gender)) %>%
  filter(! gender %in% c("not reported", "unknown")) %>%
  tabyl(gender) %>%
  adorn_totals
##  gender    n   percent
##  female  739 0.4225272
##    male 1010 0.5774728
##   Total 1749 1.0000000

Ages in dataset

table(counts_and_meta$pedaya, useNA = "always")
## 
##                  No Yes, age < 30 years                <NA> 
##                  51                2033                  95
# at least one target sample is >30

n_ped_aya <- sum(counts_and_meta$pedaya=="Yes, age < 30 years" | counts_and_meta$age_at_dx<30, na.rm = TRUE)
n_ped_aya
## [1] 2104
n_ped_aya/nrow(counts_and_meta)
## [1] 0.9655805
# more than 95% of which are pediatric <30; of the Datasets with specific ages at diagnosis reported, the median was 7

median(counts_and_meta$age_at_dx, na.rm = TRUE)
## [1] 8.96
table(is.na(counts_and_meta$age_at_dx))
## 
## FALSE  TRUE 
##  1739   440

Read lengths in dataset

seq_len_round_25 <- round(counts_and_meta$seq_length / 25) * 25
table(seq_len_round_25, useNA = "always")
## seq_len_round_25
##   25   50   75  100  125 <NA> 
##    4  238  366 1544   27    0
tabyl(seq_len_round_25)
##  seq_len_round_25    n     percent
##                25    4 0.001835704
##                50  238 0.109224415
##                75  366 0.167966957
##               100 1544 0.708581918
##               125   27 0.012391005
table(is.na(counts_and_meta$seq_length))
## 
## FALSE 
##  2179
table(counts_and_meta$seq_length)
## 
##   36   50   51   75   76   81  100  101  110  125 
##    4   11  227  279   40   47   98 1431   15   27
summary(counts_and_meta$seq_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   36.00   76.00  101.00   91.51  101.00  125.00
IQR(counts_and_meta$seq_length, na.rm = TRUE)
## [1] 25
 old_read_length_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(seq_length, group=cohort_code), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Sequence length") +
    ylab("Datasets")
 
 read_length_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(seq_length), 
                 fill = "lightgrey", color = "black", stat = StatBin2,
                 binwidth = 1) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Sequence length") +
    ylab("Datasets") +
   expand_limits(x=0)
  
read_length_histogram

subset(counts_and_meta, Mapped > Total_reads)
## # A tibble: 0 x 23
## # Groups:   cohort_code [0]
## # … with 23 variables: dataset_id <chr>, Total_reads <dbl>, Mapped <dbl>,
## #   MND <dbl>, MEND <dbl>, seq_length <dbl>, disease <chr>,
## #   age_at_dx <dbl>, pedaya <chr>, gender <chr>, cohort_code <chr>,
## #   cohort_name <chr>, doi_link <chr>, source_repository <chr>,
## #   repository_cohort_accession <chr>, repository_dataset_accession <chr>,
## #   genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## #   genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## #   genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## #   n_in_cohort <int>
subset(counts_and_meta, MND > Mapped)
## # A tibble: 0 x 23
## # Groups:   cohort_code [0]
## # … with 23 variables: dataset_id <chr>, Total_reads <dbl>, Mapped <dbl>,
## #   MND <dbl>, MEND <dbl>, seq_length <dbl>, disease <chr>,
## #   age_at_dx <dbl>, pedaya <chr>, gender <chr>, cohort_code <chr>,
## #   cohort_name <chr>, doi_link <chr>, source_repository <chr>,
## #   repository_cohort_accession <chr>, repository_dataset_accession <chr>,
## #   genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## #   genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## #   genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## #   n_in_cohort <int>
subset(counts_and_meta, MEND > MND)
## # A tibble: 0 x 23
## # Groups:   cohort_code [0]
## # … with 23 variables: dataset_id <chr>, Total_reads <dbl>, Mapped <dbl>,
## #   MND <dbl>, MEND <dbl>, seq_length <dbl>, disease <chr>,
## #   age_at_dx <dbl>, pedaya <chr>, gender <chr>, cohort_code <chr>,
## #   cohort_name <chr>, doi_link <chr>, source_repository <chr>,
## #   repository_cohort_accession <chr>, repository_dataset_accession <chr>,
## #   genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## #   genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## #   genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## #   n_in_cohort <int>

Summarize total read counts

min(counts_and_meta$Total_reads)
## [1] 206916
Total_read_counts <- counts_and_meta %>% 
  ungroup %>%
  mutate(dataset_value = Total_reads/1e6) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(Total_read_counts %>%
        select(-variance, -p25, -p75), digits = 1)
min max median iqr
0.2 668 61.2 53.3
total_read_count_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(Total_reads/1E6), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Total reads") +
    ylab("Datasets")

total_read_count_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate percentages

 read_counts_with_read_type_fractions <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(
    pct_unmapped_of_total = (Total_reads - Mapped) / Total_reads,
    pct_dupe_of_mapped = (Mapped - MND) / Mapped,
    pct_non_exonic_of_non_dupe = (MND - MEND) / MND
  )

read_type_fractions_long <- read_counts_with_read_type_fractions %>%
  pivot_longer(cols = starts_with("pct_"), names_to = "read_type_fraction_name", values_to = "dataset_value") 

Summarize read type fractions

comparison_of_distributions <- read_type_fractions_long %>% 
  group_by(read_type_fraction_name) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(comparison_of_distributions %>%
      mutate_if(is.double, ~100*.), digits = 3)
read_type_fraction_name variance min max median p25 p75 iqr
pct_dupe_of_mapped 5.884 2.853 99.997 26.887 13.450 43.033 29.583
pct_non_exonic_of_non_dupe 4.094 4.495 97.000 24.769 16.189 37.277 21.088
pct_unmapped_of_total 0.605 0.710 76.677 3.414 2.740 6.006 3.266
kable(comparison_of_distributions %>%
        select(-variance, -p25, -p75) %>%
      mutate_if(is.double, ~100*.), digits = 0)
read_type_fraction_name min max median iqr
pct_dupe_of_mapped 3 100 27 30
pct_non_exonic_of_non_dupe 4 97 25 21
pct_unmapped_of_total 1 77 3 3

statements about read type fractions

Overview

comparison_of_distributions
## # A tibble: 3 x 8
##   read_type_fraction_na… variance     min   max median    p25    p75    iqr
##   <chr>                     <dbl>   <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 pct_dupe_of_mapped      0.0588  0.0285  1.00  0.269  0.135  0.430  0.296 
## 2 pct_non_exonic_of_non…  0.0409  0.0450  0.970 0.248  0.162  0.373  0.211 
## 3 pct_unmapped_of_total   0.00605 0.00710 0.767 0.0341 0.0274 0.0601 0.0327

unmapped

# in 77 datasets, more than 25% of reads are unmapped
table(read_counts_with_read_type_fractions$pct_unmapped_of_total>0.25)
## 
## FALSE  TRUE 
##  2102    77

duplicate

# 426 datasets have more than 50% duplicates (Fig 2A).
read_counts_with_read_type_fractions %>%
  mutate(above_50 = pct_dupe_of_mapped>0.50) %>%
  tabyl(above_50)
##  above_50    n   percent
##     FALSE 1753 0.8044975
##      TRUE  426 0.1955025
fiftypct <- read_counts_with_read_type_fractions %>%
  group_by(cohort_code) %>%
  summarize(has_a_dataset_with_more_than_50pct_dupes = any(pct_dupe_of_mapped>0.50),
            median_pct_dupe_of_mapped = median(pct_dupe_of_mapped)) %>%
  mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
         median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_dupes)


fiftypct %>% tabyl(median_lt_50)
##  median_lt_50  n   percent
##         FALSE  7 0.1458333
##          TRUE 41 0.8541667
fiftypct %>% tabyl(has_a_dataset_with_more_than_50pct_dupes)
##  has_a_dataset_with_more_than_50pct_dupes  n percent
##                                     FALSE 15  0.3125
##                                      TRUE 33  0.6875
fiftypct %>% tabyl(median_lt_50_and_has_a_dataset)
##  median_lt_50_and_has_a_dataset  n   percent
##                           FALSE 22 0.4583333
##                            TRUE 26 0.5416667

exonic

read_counts_with_read_type_fractions %>%
  mutate(above_50 = pct_non_exonic_of_non_dupe>0.50) %>%
  tabyl(above_50)
##  above_50    n   percent
##     FALSE 1849 0.8485544
##      TRUE  330 0.1514456
fiftypcte <- read_counts_with_read_type_fractions %>%
  group_by(cohort_code) %>%
  summarize(has_a_dataset_with_more_than_50pct_ne = any(pct_non_exonic_of_non_dupe>0.50),
            median_pct_dupe_of_mapped = median(pct_non_exonic_of_non_dupe)) %>%
  mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
         median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_ne)


fiftypcte %>% tabyl(median_lt_50)
##  median_lt_50  n percent
##         FALSE  3  0.0625
##          TRUE 45  0.9375
fiftypcte %>% tabyl(has_a_dataset_with_more_than_50pct_ne)
##  has_a_dataset_with_more_than_50pct_ne  n percent
##                                  FALSE 36    0.75
##                                   TRUE 12    0.25
fiftypcte %>% tabyl(median_lt_50_and_has_a_dataset)
##  median_lt_50_and_has_a_dataset  n percent
##                           FALSE 39  0.8125
##                            TRUE  9  0.1875

Correlation of depth of sequencing to duplicate read fraction

Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?

cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads, 
    read_counts_with_read_type_fractions$pct_dupe_of_mapped,
    method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34

Plot correlation of depth of sequencing to duplicate read fraction

this_data <- read_counts_with_read_type_fractions %>%
  select(Total_reads, pct_dupe_of_mapped)
## Adding missing grouping variables: `cohort_code`
this_plot_title <- paste("Correlation of Total_reads and % Duplicates")

these_stats <- this_data  %>%
  ungroup %>%
  summarize(r_corr = round(cor(Total_reads, pct_dupe_of_mapped), 2))

ggplot(this_data,
       aes(x=Total_reads/1E6, y=pct_dupe_of_mapped)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Total_reads/1E6),
             y=max(this_data$pct_dupe_of_mapped),
             hjust = 1, vjust = 1
             ) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  ggtitle(this_plot_title)  + 
  theme(legend.position="none", plot.title=element_text(size=22)) +
  xlab("Read counts (million)") +
  ylab("% Duplicates")
## `geom_smooth()` using formula 'y ~ x'

Generate schematic defining read types

colors_for_read_types <- these_colors_with_MEND
names(colors_for_read_types) <- gsub(" reads", "", names(these_colors_with_MEND))

colors_for_read_types <- c(colors_for_read_types, 
                           "Genome" = "darkblue",
                           "Exon" = "darkblue",
                           "Aligned reads" = "darkgrey")

plot1_colnames <- c("label", "x1", "x2", "y1", "y2")

read_type_schematic_data_as_vector <- c("Duplicate", 3, 4, 5, 6, 
                       "MEND", 3, 4, 4, 5, 
                       "MEND", 2.5, 3.5, 3, 4, 
                       "Non-exonic", 4.5, 5.5, 3, 4,
                       "Unmapped", 6, 7, 3, 4,
                       "Exon", 2.5, 4, 1, 2,
                       "Aligned reads", 2.8, 2.8, 6, 7)

rows_in_schematic <- length(read_type_schematic_data_as_vector)/length(plot1_colnames)

read_type_schematic_data <- matrix(read_type_schematic_data_as_vector, 
                     ncol = length(plot1_colnames), byrow = TRUE,
                     dimnames = list(c(1:rows_in_schematic), plot1_colnames)) %>%
  as_tibble %>%
  type_convert() %>%
  mutate(
    base_color = colors_for_read_types[match(label, names(colors_for_read_types))],
    border_color = case_when(
      label %in% c("Aligned reads", "Genome", "MEND", "Exon") ~ NA_character_,
      TRUE ~ base_color),
    fill_color = ifelse(label %in% c("MEND", "Exon"),  base_color, "white"),
    text_color = case_when(
      label %in% c("MEND", "Exon")~  "white",
      TRUE ~ base_color),
    mid_bar_x = map2_dbl(x1, x2,  function(x, y) mean(c(x,y))),
    mid_bar_y = map2_dbl(y1, y2,  function(x, y) mean(c(x,y)))
  )
## Parsed with column specification:
## cols(
##   label = col_character(),
##   x1 = col_double(),
##   x2 = col_double(),
##   y1 = col_double(),
##   y2 = col_double()
## )
padding_size = 0.35
end_of_box <- sort(read_type_schematic_data$x2,partial=length(read_type_schematic_data$x2)-1)[length(read_type_schematic_data$x2)-1] + padding_size
read_type_schematic <- ggplot(read_type_schematic_data, 
                              aes(xmin=x1, xmax=x2, ymin=y1, ymax = y2, 
                                  fill = fill_color, color = border_color)) +
  geom_rect() +
  
  geom_segment(x=min(read_type_schematic_data$x1-padding_size), y=1.5, xend = 5.75, yend = 1.5, color = "darkblue") +   
  geom_text(aes(label = label, x = mid_bar_x, y=mid_bar_y,
                color = text_color), 
            size = 4,
            fontface = "bold") +
  geom_rect(aes(xmin = min(x1 - padding_size), 
                ymin = 2.5, 
                xmax = end_of_box,
                ymax=max(y2 + padding_size)), 
            fill = NA, color = "darkgrey", size = 0.25) +
  scale_fill_identity() +
  scale_color_identity() +
  theme_void() + 
  theme(
    panel.grid.major.x = element_line(color="lightgrey", size=0.2),
    ) + 
 scale_x_continuous(
   breaks = seq(min(read_type_schematic_data$x1) - padding_size + 0.25, end_of_box , 0.25))

read_type_schematic

Generate schematic defining read type fractions

stat_names <- c("pct_unmapped_of_total",  "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_labels <- c("Unmapped",  "Duplicate", "Non-exonic") 
complement_stat_names <- c("Mapped reads",  "Non-duplicate reads", "Exonic reads") 
divisor_name = c("total", "mapped", "non-duplicate")

comparison_of_remaining <- read_type_fractions_long %>% 
  mutate(read_type_fraction_name = complement_stat_names[match(read_type_fraction_name, stat_names)]) %>%
  group_by(read_type_fraction_name) %>%
  summarize(min = round(1-max(dataset_value), 2),
    max = round(1-min(dataset_value), 2),
    median = round(1-median(dataset_value), 2),
    p25 = round(1-quantile(dataset_value, 0.25), 2),
    p75 = round(1-quantile(dataset_value, 0.75), 2)) %>%
      mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; x͂=",100* median, "%)"))

positive_bars <- comparison_of_remaining %>%
  select(-min, -max, -p25, -p75) %>%
  mutate(read_type_fraction_label = read_type_fraction_name, 
         position = "1") %>%
  rename(abs_median = median)

negative_bars <- tibble(read_type_fraction_name = positive_bars$read_type_fraction_name,
                        read_type_fraction_label = stat_labels[match(read_type_fraction_name, complement_stat_names)],
                        bar_label = read_type_fraction_label,
                        abs_median = 1-positive_bars$abs_median,
                        position = "2")

total_bar <- tibble(read_type_fraction_name = "Total reads",
                    read_type_fraction_label = read_type_fraction_name,
                    bar_label = "Total reads\n(100% of reads)",
                    abs_median = 1,
                    position = "1"
                    )


MEND_stats_of_total <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(
    pct_MEND_of_total = (Total_reads - MEND) / Total_reads
  ) %>%
  ungroup %>% 
  summarize(min = round(1-max(pct_MEND_of_total), 2),
    max = round(1-min(pct_MEND_of_total), 2),
    median = round(1-median(pct_MEND_of_total), 2))

MEND_bar <- tibble(read_type_fraction_name = "MEND reads",
                    read_type_fraction_label = read_type_fraction_name,
                    bar_label = with(MEND_stats_of_total, paste0 ("MEND reads\n(", 100*min, "-", 100*max, "% of total reads; x͂=",100* median, "%)")),
                    abs_median = 1,
                    position = "1"
                    )

bar_names_in_order <- c("Total reads", "Mapped reads", "Non-duplicate reads", "Exonic reads", "MEND reads")
  
all_bars <- bind_rows(positive_bars, negative_bars, total_bar, MEND_bar) %>%
  mutate(read_type_fraction_name = factor(read_type_fraction_name, 
                                          levels = bar_names_in_order))

cat_space <- all_bars %>% filter(position == 1) %>%
  arrange(read_type_fraction_name) %>%
  select(read_type_fraction_name, abs_median) %>%
  ungroup %>%
  mutate(lagged1_abs_median = lag(abs_median, default =1),
         lagged2_abs_median = lag(abs_median, n=2, default =1),
         category_space = lagged1_abs_median*lagged2_abs_median
         ) %>%
  select(read_type_fraction_name, category_space)

these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")


all_bars_rel <- left_join(all_bars, cat_space, by = "read_type_fraction_name") %>%
  mutate(rel_median=ifelse(read_type_fraction_name == "MEND reads",
                           abs_median[read_type_fraction_label=="Exonic reads"]*
                             category_space[read_type_fraction_label=="Exonic reads"], 
                           abs_median*category_space),
         read_type_fraction_name = factor(read_type_fraction_name, levels = rev(bar_names_in_order)),
         position = factor(position, levels = rev(sort(unique(position)))),
         bar_color = these_colors_with_MEND[match(read_type_fraction_name, names(these_colors_with_MEND))],
         fill_val = ifelse(position == 1, bar_color, NA),
         border_color_val = bar_color,
         text_color = ifelse(position ==1, "white", "black"),
font_size = ifelse(abs_median<0.1, 3,3.5),
text_angle = ifelse(abs_median<0.1, 90,0)
         ) 
read_type_fractions_schematic <- ggplot(all_bars_rel, aes(y=read_type_fraction_name,
                x=rel_median, group = position)) + 
   geom_col(aes(fill = fill_val,
                color = border_color_val
                )) +
   geom_text(aes(label=bar_label,
                 color = text_color,
                 size = font_size,
                 angle= text_angle), 
             position = position_stack(vjust = .5),
              fontface = "bold") +
   scale_fill_identity() +
   scale_color_identity()  +
  scale_size_identity() + 
   theme_void() +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) 


ggplot(all_bars_rel, aes(y=read_type_fraction_name,
                x=rel_median, group = position)) + 
   geom_col(aes(fill = fill_val,
                color = border_color_val
                )) +
   geom_text(aes(label=read_type_fraction_label,
                 color = text_color,
                 size = font_size,
                 angle = text_angle), 
             position = position_stack(vjust = .5),
              fontface = "bold") +
   scale_fill_identity() +
   scale_color_identity()  +
  scale_size_identity() + 
   theme_void() +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

read_type_fractions_schematic

figure_name <- "f1"

right_side <- plot_grid(read_type_schematic,
read_type_fractions_schematic,
nrow=2,
rel_heights = c(1, 2),
labels = c("C", "D"))

left_side <- plot_grid(cohort_size_histogram,
read_length_histogram,
nrow=2,
labels = c("A", "B"))

plot_grid(left_side,
right_side,
ncol = 2) +
   draw_label(paste(figure_name, Sys.time()),
              x = 0, y = 0.02, hjust = 0, size = 6) 

stat_names <- c("pct_unmapped_of_total",  "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_label <- c("% unmapped", "% duplicate \n(of mapped)", "% non-exonic \n(of non-duplicate)")
names(stat_label) <- stat_names

Plot read fractions

colors_for_stat_names <- these_colors[c("Mapped reads", "Non-duplicate reads","Exonic reads")]
names(colors_for_stat_names) <- stat_names

these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
names(these_colors) <-  c("not assigned", "Total reads", "Non-exonic reads",  "Exonic reads", "Duplicate reads",  "Non-duplicate reads",  "Unmapped reads", "Mapped reads")

# 6 x 11 was a good ratio, but the text was too small

read_type_fractions_long_for_histogram <- read_type_fractions_long %>%
  mutate(read_type_fraction_name = factor(read_type_fraction_name,
                                          levels = stat_names))

read_type_fractions_histogram <-  ggplot(read_type_fractions_long_for_histogram) +
    geom_histogram(aes(x = dataset_value, color = read_type_fraction_name), 
                   fill = "white", stat = StatBin2)  +
  scale_color_manual(values = colors_for_stat_names) +
  facet_wrap(~read_type_fraction_name, 
             nrow = 1, 
             scales = "free_y",
             labeller = labeller(
               read_type_fraction_name = stat_label
               )
             ) +
  ylab("Datasets") +
  xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_histogram$dataset_id)),  " datasets")) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
    theme(strip.text.x = element_text(size = 12, face = "bold")) +
    theme(legend.position = "none")

read_type_fractions_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate MEND reads as a fraction of total

read_counts_with_MEND_fractions <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(pct_MEND_of_total = MEND /Total_reads)

ggplot(read_counts_with_MEND_fractions) + 
  geom_histogram(aes(pct_MEND_of_total), stat = StatBin2) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

MEND_pct <- read_counts_with_MEND_fractions %>% 
  ungroup %>%
  mutate(dataset_value = pct_MEND_of_total) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(MEND_pct %>%
        select(-variance, -p25, -p75) %>%
      mutate_if(is.double, ~100*.), digits = 0)
min max median iqr
0 79 50 31
head(sort(read_counts_with_MEND_fractions$pct_MEND_of_total))
## [1] 0.0000267009 0.0002533711 0.0003651723 0.0003767257 0.0003997294
## [6] 0.0004279584
sum(read_counts_with_MEND_fractions$pct_MEND_of_total<0.25)
## [1] 355
read_counts_with_MEND_fractions %>%
  ungroup %>%
  mutate(pct_under_25 = pct_MEND_of_total<0.25) %>%
  tabyl(pct_under_25) %>%
  adorn_totals()
##  pct_under_25    n   percent
##         FALSE 1824 0.8370812
##          TRUE  355 0.1629188
##         Total 2179 1.0000000
read_counts_with_MEND_fractions %>%
  ungroup %>%
  filter(Mapped > 30E6) %>%
  mutate(pct_under_25 = pct_MEND_of_total<0.25) %>%
  tabyl(pct_under_25) %>%
  adorn_totals()
##  pct_under_25    n   percent
##         FALSE 1739 0.8368624
##          TRUE  339 0.1631376
##         Total 2078 1.0000000

Show per-cohort distribution of read_type_fractions

figure_name <- "fracs_by_group"

read_type_fractions_long_for_boxplot <- read_type_fractions_long_for_histogram %>%
  ungroup %>%
  mutate(boxplot_label = fct_reorder(paste0(as.character(cohort_code), ", n=", n_in_cohort), n_in_cohort))


read_type_fractions_per_cohort_boxplot <- ggplot(read_type_fractions_long_for_boxplot) +
  geom_boxplot(aes(x = dataset_value, y=boxplot_label), outlier.size = 0.5)  +
  facet_wrap(~read_type_fraction_name, 
             nrow = 1, 
             labeller = labeller(
               read_type_fraction_name = stat_label
             )
  ) +
  ylab("Cohorts") +
  xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_boxplot$dataset_id)),  " datasets")) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  theme(strip.text.x = element_text(size = 12, face = "bold")) +
  theme(legend.position = "none")


read_type_fractions_per_cohort_boxplot

Characterize Cohort 4

read_type_names <- c("Total_reads", "Mapped",  "MND", "MEND")
this_cohort_name <- "EGAD00001003279"

this_cohort_code <- subset(read_counts_with_read_type_fractions, cohort_name == this_cohort_name)$cohort_code[1]

this_cohort_code
## [1] "Cohort_4"

in MS

 read_counts_with_read_type_fractions %>%
  filter(cohort_code == this_cohort_code) %>%
   mutate(gt_98 = pct_dupe_of_mapped > 0.98) %>%
   tabyl(gt_98) %>%
   add_totals_row
## Warning: 'add_totals_row' is deprecated.
## Use 'adorn_totals("row")' instead.
## See help("Deprecated")
##  gt_98   n   percent
##  FALSE  55 0.4330709
##   TRUE  72 0.5669291
##  Total 127 1.0000000
read_counts_with_read_type_fractions %>%
  filter(cohort_code == this_cohort_code) %>%
  filter(pct_dupe_of_mapped > 0.98) %>%
  pull(Total_reads) %>% min
## [1] 178294341

other

 counts_and_meta_long <- counts_and_meta  %>%
  pivot_longer(cols = c(Total_reads, Mapped, MEND, MND), names_to = "read_type_name", values_to = "dataset_value")  %>%
   ungroup %>%
   mutate(read_type_name = factor(read_type_name, levels = read_type_names))

counts_and_meta_long_one_proj <-  counts_and_meta_long %>%
  filter(cohort_code == this_cohort_code)

table(counts_and_meta_long_one_proj$read_type_name)
## 
## Total_reads      Mapped         MND        MEND 
##         127         127         127         127
# 78 datasets in the cohort have fewer than 0.2 million MEND reads. 
sum((counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND"))$dataset_value < 200000)
## [1] 78
those_datasets <- counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND", dataset_value < 200000) %>% pull(dataset_id)


summary((counts_and_meta_long_one_proj %>% filter(read_type_name=="Total_reads", dataset_id %in% those_datasets))$dataset_value)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 178294341 199916664 208667776 211606891 223832191 247618832
max_plots_to_include <- 10
n_datasets_to_plot <- ifelse(n_distinct(counts_and_meta_long_one_proj$dataset_id)>max_plots_to_include, 
                         max_plots_to_include, 
                         n_distinct(counts_and_meta_long_one_proj$dataset_id))
plot_word <- ifelse(n_datasets_to_plot==max_plots_to_include, max_plots_to_include, paste("all", n_distinct(counts_and_meta_long_one_proj$dataset_id)))

set.seed(100)
datasets_to_plot <- sample(unique(counts_and_meta_long_one_proj$dataset_id), n_datasets_to_plot)

ggplot(subset(counts_and_meta_long_one_proj, dataset_id %in% datasets_to_plot)) + 
  geom_col(aes(x=read_type_name, y=dataset_value/1e6, fill = read_type_name)) +
  facet_wrap(~dataset_id, ncol = 1, strip.position="right")  +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  theme(strip.text.y = element_text(angle = 0)) +
  scale_fill_brewer(palette = "Set1")  +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle(paste("Read counts for", plot_word, "datasets from", this_cohort_code)) +
  coord_flip()

ggplot(counts_and_meta_long_one_proj %>% filter(read_type_name == "MEND")) + 
  geom_histogram(aes(dataset_value/1e6), stat = StatBin2) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  ggtitle(paste("Distribution of MEND counts in", this_cohort_code))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

bl <- colorRampPalette(c("navy","royalblue","lightskyblue"))(200)                      
re <- colorRampPalette(c("mistyrose", "red2","darkred"))(200)


ggplot(counts_and_meta %>% 
         filter(cohort_code == this_cohort_code) %>%
         mutate(pct_dupes = (Mapped - MND)/Mapped)) + 
  geom_point(aes(x=MEND/1e6, y=pct_dupes, fill = Total_reads/1e6), shape=21, color = "black") +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  scale_fill_gradientn(colours = c(bl,"white", re)) +
  ggtitle(paste("Relationship of dupe fraction to MEND count in", this_cohort_code), "For detecting whether datasets with less info were sequenced more deeply. \nNormally, higher depth datasets have more MEND reads and more duplicate \nreads than the same dataset sequenced at a lower total depth.")

counts_and_meta_long_one_proj %>%
  select(seq_length, dataset_id) %>%
  distinct %>%
  tabyl(seq_length)
##  seq_length   n percent
##          51 127       1

Expressed genes

What is the distribution of measured genes?

gene_counts_long <- counts_and_meta %>%
  pivot_longer(cols = starts_with("genes_"), names_to = "Expression_threshold")  %>%
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  


ggplot(gene_counts_long) + 
  geom_histogram(aes(x=value/1e3, fill = Expression_threshold), stat = StatBin) +
  facet_wrap(~Expression_threshold)  +
    theme_minimal() +
  scale_fill_brewer(palette = "Set1") +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
#  theme_linedraw(base_size = 20) +
    theme(legend.position = "none") +
  # scale_fill_brewer(palette = "Set1") +
  ylab("Datasets") +
  xlab("Measured genes") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What are the exact lowest gene counts?

sort(counts_and_meta$genes_expressed_above_0) [1:100]
##   [1]     9     9    12    12    13    14    14    15    16    16    17
##  [12]    18    18    18    18    18    18    19    19    19    19    20
##  [23]    20    20    20    20    20    21    21    21    21    21    21
##  [34]    22    22    22    23    23    23    23    23    24    24    24
##  [45]    24    24    24    24    25    25    25    26    26    26    26
##  [56]    27    27    27    27    28    28    28    28    29    29    29
##  [67]    29    29    30    30    31    31    32    33    35    36    37
##  [78]    40  6098  7107 11830 15323 16793 16837 16943 17233 17531 18288
##  [89] 18588 18704 19626 19693 19749 19781 19942 19963 20063 20120 20203
## [100] 20247

correlations with all datasets

library(corrr)


counts_and_meta %>% 
    ungroup %>% 
    select_if(is.double) %>%
       correlate() %>%
  filter(rowname %in% read_type_names) %>%
  rename(Read_type_count=rowname) %>% 
  mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
  arrange(Read_type_count) %>%
  select(c(Read_type_count, starts_with("genes_expressed"))) %>%
  kable(caption = "Correlations with all datasets", digits = 2)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all datasets
Read_type_count genes_expressed_above_0 genes_expressed_above_1 genes_expressed_above_2 genes_expressed_above_3 genes_expressed_above_4 genes_expressed_above_5
Total_reads -0.20 -0.40 -0.40 -0.40 -0.39 -0.39
Mapped -0.19 -0.41 -0.41 -0.41 -0.40 -0.39
MND 0.51 0.24 0.20 0.18 0.17 0.16
MEND 0.48 0.27 0.26 0.26 0.26 0.26

correlations with all but low gene count datasets

counts_and_meta %>% 
  filter(genes_expressed_above_0 > min_genes_gt_0) %>%
    ungroup %>% 
    select_if(is.double) %>%
       correlate() %>%
  filter(rowname %in% read_type_names) %>%
  rename(Read_type_count=rowname) %>% 
  mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
  arrange(Read_type_count) %>%
  select(c(Read_type_count, starts_with("genes_expressed"))) %>%
  kable(caption = "Correlations with all but low gene count datasets", digits = 2)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all but low gene count datasets
Read_type_count genes_expressed_above_0 genes_expressed_above_1 genes_expressed_above_2 genes_expressed_above_3 genes_expressed_above_4 genes_expressed_above_5
Total_reads 0.13 -0.26 -0.28 -0.28 -0.28 -0.28
Mapped 0.21 -0.25 -0.26 -0.27 -0.27 -0.27
MND 0.51 0.04 0.02 0.00 0.00 0.00
MEND 0.44 0.08 0.09 0.11 0.11 0.12

Plot correlations betweeen gene counts and read types

Groom data

expr_gene_and_read_counts_to_plot <- counts_and_meta %>%
  pivot_longer (cols = c(MND, MEND, Total_reads, Mapped), names_to = "Read_type", values_to = "Read_count") %>%
  pivot_longer (cols = starts_with("genes_expressed"), names_to = "Expression_threshold", values_to = "Gene_count") %>%
  mutate(Read_type = factor(Read_type, levels = read_type_names))

Identify datasets with unusual gene counts or read depths

datasets_with_normal_expressed_gene_numbers <- counts_and_meta %>%
  filter(genes_expressed_above_0 > min_genes_gt_0) %>%
  pull(dataset_id)

datasets_with_outlier_gene_counts <- counts_and_meta %>%
  ungroup %>%
  filter(is_outlier(genes_expressed_above_0)) %>%
  pull(dataset_id) 

datasets_with_outlier_depths <- counts_and_meta %>%
  ungroup %>%
  filter(is_outlier(Total_reads)) %>%
  pull(dataset_id) 

Function for calculating gene count correlations and plotting results

plot_gene_count_correlations <- function(this_data = expr_gene_and_read_counts_to_plot, this_plot_title = "gene count correlations"){

this_data <- this_data %>% 
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  
  
  
these_stats <- this_data  %>%
group_by(Read_type, Expression_threshold) %>%
  summarize(r_corr = round(cor(Gene_count, Read_count), 2)) 

# unique(this_data$Read_type)
# dput(unique(this_data$Read_type))
# these_colors_with_MEND

colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00", 
"MND"="#E31A1C", "MEND"="#000000")

ggplot(this_data,
       aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black") +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Read_count/1E6),
             y=max(this_data$Gene_count/1e3),
             hjust = 1, vjust = 1
             ) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  scale_color_manual(values = colors_for_correlation_plot) +
  facet_grid(Read_type~Expression_threshold) +
  ggtitle(this_plot_title)  + 
  theme(legend.position="none") +
  xlab("Read counts (million)") +
  ylab("Gene counts (thousand)")

}

for all datasets

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot,
                             "Correlations calculated across all datasets")
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(dataset_id %in% datasets_with_normal_expressed_gene_numbers),
  paste("Correlations in datasets with more than", min_genes_gt_0, "genes expressed above 0"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_gene_counts),
  paste("Correlations calculated across all but outlier gene count datasets"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_depths),
  paste("Correlations calculated across all but outlier read depth datasets"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_depths, 
         ! dataset_id %in% datasets_with_outlier_gene_counts
         ),
  paste("Correlations calculated across all but datasets with outlier read depth or gene count"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(
  expr_gene_and_read_counts_to_plot %>% 
    filter(disease ==  "acute lymphoblastic leukemia"),
  "Correlations calculated among acute lymphoblastic leukemia datasets")
## `geom_smooth()` using formula 'y ~ x'

corr plot for figure

max_total_read_depth <- 250*1e6
print(min_genes_gt_0)
## [1] 100
sum(counts_and_meta$genes_expressed_above_0 <= min_genes_gt_0)
## [1] 78
sum(counts_and_meta$Total_reads > max_total_read_depth)
## [1] 105
# min_genes_gt_0
exclude_for_depth <- counts_and_meta %>%
  filter(Total_reads > max_total_read_depth) %>%
  pull(dataset_id)

exclude_for_gene_count <- counts_and_meta %>%
  filter(genes_expressed_above_0 < min_genes_gt_0) %>%
  pull(dataset_id)


this_data <- expr_gene_and_read_counts_to_plot %>% 
      filter(
        Expression_threshold %in% paste0("genes_expressed_above_", 1:3),
       ! dataset_id %in% exclude_for_depth, 
        ! dataset_id %in% exclude_for_gene_count
      ) %>%
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  

n_distinct(this_data$dataset_id)
## [1] 1996
this_plot_title <- paste("Correlations calculated across all but datasets with excessive read depth or low gene count")

these_stats <- this_data  %>%
group_by(Read_type, Expression_threshold) %>%
  summarize(r_corr = round(cor(Gene_count, Read_count), 2))

colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00", 
"MND"="#E31A1C", "MEND"="#000000")

cor_plots_excluding_high_read_depth_and_low_gene_count_datasets <- ggplot(this_data,
       aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black") +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Read_count/1E6),
             y=max(this_data$Gene_count/1e3),
             hjust = 1, vjust = 1
             ) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5),
        strip.text = element_text(face = "bold")) +
  scale_color_manual(values = colors_for_correlation_plot) +
  facet_grid(Read_type~Expression_threshold) +
  theme(legend.position="none") +
  scale_x_continuous("Read counts (million)", n.breaks = 4) +
  ylab("Gene counts (thousand)") 


cor_plots_excluding_high_read_depth_and_low_gene_count_datasets
## `geom_smooth()` using formula 'y ~ x'

# Figure 2

f2a <- read_type_fractions_histogram +
            theme(strip.text.x = element_text(size = 10, face = "bold"),
                  axis.title.x=element_blank(),
                  axis.text.x=element_blank())

f2b <- read_type_fractions_per_cohort_boxplot +
            theme(strip.text.x = element_blank(),
                  strip.background = element_blank(),
                  strip.text = element_blank())

f2ab <- plot_grid(f2a, 
                  f2b,
                  ncol = 1,
                  rel_heights = c(1.5, 6),
                  align = "v",
                  labels = "AUTO")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
f2c <- plot_grid(cor_plots_excluding_high_read_depth_and_low_gene_count_datasets,
          labels = "C")
## `geom_smooth()` using formula 'y ~ x'
f2 <- plot_grid(f2ab,
                 f2c,
                 nrow=1,
                 rel_widths = c(4,2.3))

figure_name <- "f2"
f2 +   draw_label(paste(figure_name, Sys.time()),
             x = 0, y = 0.02, hjust = 0, size = 6) 

Plot sequence length against pct duplicates

ggplot(read_counts_with_read_type_fractions) + 
  geom_point(aes(x=seq_length, y=pct_dupe_of_mapped))

ggplot(read_counts_with_read_type_fractions) + 
  geom_boxplot(aes(x=seq_length, y=pct_dupe_of_mapped, group = seq_length))

Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?

cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads, 
    read_counts_with_read_type_fractions$pct_dupe_of_mapped,
    method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34

SessionInfo

pander(sessionInfo(), compact = FALSE)

R version 3.6.0 (2019-04-26)

Platform: x86_64-apple-darwin15.6.0 (64-bit)

locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8

attached base packages:

  • stats
  • graphics
  • grDevices
  • utils
  • datasets
  • methods
  • base

other attached packages:

  • corrr(v.0.4.0)
  • kableExtra(v.1.1.0)
  • pander(v.0.6.3)
  • RColorBrewer(v.1.1-2)
  • cowplot(v.1.0.0)
  • ggpubr(v.0.2.5)
  • magrittr(v.1.5)
  • ggthemes(v.4.2.0)
  • knitr(v.1.25)
  • viridis(v.0.5.1)
  • viridisLite(v.0.3.0)
  • janitor(v.1.2.0)
  • forcats(v.0.4.0)
  • stringr(v.1.4.0)
  • dplyr(v.0.8.5)
  • purrr(v.0.3.3)
  • readr(v.1.3.1)
  • tidyr(v.1.0.0)
  • tibble(v.3.0.1)
  • ggplot2(v.3.3.0)
  • tidyverse(v.1.2.1)

loaded via a namespace (and not attached):

  • Rcpp(v.1.0.3)
  • lubridate(v.1.7.4)
  • lattice(v.0.20-38)
  • assertthat(v.0.2.1)
  • digest(v.0.6.23)
  • utf8(v.1.1.4)
  • R6(v.2.4.1)
  • cellranger(v.1.1.0)
  • backports(v.1.1.5)
  • evaluate(v.0.14)
  • httr(v.1.4.1)
  • highr(v.0.8)
  • pillar(v.1.4.3)
  • rlang(v.0.4.6)
  • readxl(v.1.3.1)
  • rstudioapi(v.0.10)
  • Matrix(v.1.2-17)
  • rmarkdown(v.1.16)
  • splines(v.3.6.0)
  • labeling(v.0.3)
  • webshot(v.0.5.1)
  • munsell(v.0.5.0)
  • broom(v.0.5.2)
  • compiler(v.3.6.0)
  • modelr(v.0.1.5)
  • xfun(v.0.10)
  • pkgconfig(v.2.0.3)
  • mgcv(v.1.8-29)
  • htmltools(v.0.4.0)
  • tidyselect(v.1.1.0)
  • gridExtra(v.2.3)
  • fansi(v.0.4.0)
  • crayon(v.1.3.4)
  • withr(v.2.1.2)
  • grid(v.3.6.0)
  • nlme(v.3.1-141)
  • jsonlite(v.1.6)
  • gtable(v.0.3.0)
  • lifecycle(v.0.2.0)
  • scales(v.1.1.0)
  • cli(v.2.0.0)
  • stringi(v.1.4.3)
  • farver(v.2.0.1)
  • ggsignif(v.0.6.0)
  • xml2(v.1.2.2)
  • ellipsis(v.0.3.0)
  • generics(v.0.0.2)
  • vctrs(v.0.3.0)
  • tools(v.3.6.0)
  • glue(v.1.4.1)
  • hms(v.0.5.1)
  • yaml(v.2.2.0)
  • colorspace(v.1.4-1)
  • rvest(v.0.3.4)
  • haven(v.2.1.1)